001 /* 002 * FactorSequence.java 003 * 004 * Copyright 2003 Sergio Anibal de Carvalho Junior 005 * 006 * This file is part of NeoBio. 007 * 008 * NeoBio is free software; you can redistribute it and/or modify it under the terms of 009 * the GNU General Public License as published by the Free Software Foundation; either 010 * version 2 of the License, or (at your option) any later version. 011 * 012 * NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; 013 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR 014 * PURPOSE. See the GNU General Public License for more details. 015 * 016 * You should have received a copy of the GNU General Public License along with NeoBio; 017 * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, 018 * Boston, MA 02111-1307, USA. 019 * 020 * Proper attribution of the author as the source of the software would be appreciated. 021 * 022 * Sergio Anibal de Carvalho Junior mailto:sergioanibaljr@users.sourceforge.net 023 * Department of Computer Science http://www.dcs.kcl.ac.uk 024 * King's College London, UK http://www.kcl.ac.uk 025 * 026 * Please visit http://neobio.sourceforge.net 027 * 028 * This project was supervised by Professor Maxime Crochemore. 029 * 030 */ 031 032 package neobio.alignment; 033 034 import java.io.Reader; 035 import java.io.BufferedReader; 036 import java.io.IOException; 037 038 /** 039 * This class builds a list of factors of a character sequence as induced by its 040 * Lempel-Ziv (LZ78) factorisation. Each factor is enconded as the longest factor 041 * previously seen plus one character. 042 * 043 * <P>The input can come from any source, provided it is encapsulated in a proper 044 * <CODE>Reader</CODE> instance. The stream is expected to be ready (i.e. the next 045 * <CODE>read</CODE> operation must return the first character of the sequence) and it is 046 * not closed when its end is reached, so the client is allowed to reset it and maybe use 047 * it for another purpose.</P> 048 * 049 * <P>Sequences can contain letters only although lines started with the 050 * <CODE>COMMENT_CHAR</CODE> character ('>') are regarded as comments and are completely 051 * skipped. White spaces (including tabs, line feeds and carriage returns) are also 052 * ignored throughout.</P> 053 * 054 * <P>This class uses a {@linkplain Trie} to keep track of a list of factors. Each node of 055 * the trie contains a {@linkplain Factor} of the text. As the sequence is read from the 056 * input, the trie is traversed as far as possible. When a leaf node is reached (which 057 * means that the longest prefix of the input has been found), two tasks are 058 * accomplished:</P> 059 * 060 * <UL> 061 * <LI>a new <CODE>Factor</CODE> is created with the character at the current position of 062 * the input and the leaf node's factor; 063 * <LI>a new node is added to the trie with the character at the current position of the 064 * input; 065 * </UL> 066 * 067 * <P>Each factor also receives a serial number according to the order they are found and 068 * a pointer to the next factor (in that order) for fast access. This pointer, together 069 * with the factor's ancestor pointer forms a doubly-linked list of factors. The original 070 * text can then be reconstructed simply by following the linked list and writing out its 071 * factors.</P> 072 * 073 * <P>As an example, the sequence <CODE>ACTAAACCGCATTAATAATAAAA</CODE> is parsed into the 074 * following 12 factors:</P> 075 * 076 * <CODE><BLOCKQUOTE><PRE> 077 * 0 ( , ) = empty 078 * 1 (0,A) = A 079 * 2 (0,C) = C 080 * 3 (0,T) = T 081 * 4 (1,A) = AA 082 * 5 (1,C) = AC 083 * 6 (2,G) = CG 084 * 7 (2,A) = CA 085 * 8 (3,T) = TT 086 * 9 (4,T) = AAT 087 * 10 (9,A) = AATA 088 * 11 (4,A) = AAA 089 * 090 * serial # (prefix, new char) = factor text 091 * </PRE></BLOCKQUOTE></CODE> 092 * 093 * <P>This class is used by {@linkplain CrochemoreLandauZivUkelson} algorithm to speed up 094 * the classic dynamic programming approach to sequence alignment.</P> 095 * 096 * @author Sergio A. de Carvalho Jr. 097 * @see Factor 098 * @see Trie 099 * @see CrochemoreLandauZivUkelson 100 */ 101 public class FactorSequence 102 { 103 /** 104 * The character used to start a comment line in a sequence file. When this character 105 * is found, the rest of the line is ignored. 106 */ 107 protected static final char COMMENT_CHAR = '>'; 108 109 /** 110 * A pointer to the root factor, the one that starts the list of factors. 111 */ 112 protected Factor root_factor; 113 114 /** 115 * The numbers of character represented by this sequence. 116 */ 117 protected int num_chars; 118 119 /** 120 * The numbers of factors generated by the LZ78 parsing of the sequence. 121 */ 122 protected int num_factors; 123 124 /** 125 * Creates a new instance of a <CODE>FactorSequence</CODE>, loading the sequence data 126 * from the <CODE>Reader</CODE> input stream. A doubly-linked list of factors is built 127 * according to its LZ78 factorisation. 128 * 129 * @param reader source of characters for this sequence 130 * @throws IOException if an I/O exception occurs when reading the input 131 * @throws InvalidSequenceException if the input does not contain a valid sequence 132 */ 133 public FactorSequence (Reader reader) 134 throws IOException, InvalidSequenceException 135 { 136 BufferedReader input = new BufferedReader(reader); 137 Trie root_node, current_node, new_node = null; 138 Factor current_factor, last_factor, new_factor; 139 int ch; 140 char c; 141 142 // create root factor and the root node of the trie 143 root_factor = new Factor (); 144 root_node = new Trie (root_factor); 145 num_factors = 1; 146 num_chars = 0; 147 148 current_node = root_node; 149 last_factor = root_factor; 150 151 // read characters from the input 152 while ((ch = input.read()) != -1) 153 { 154 c = (char) ch; 155 156 if (c == COMMENT_CHAR) 157 // it's a comment line: skip it! 158 input.readLine(); 159 160 // accept letters only 161 else if (Character.isLetter(c)) 162 { 163 num_chars++; 164 165 // walk down the trie as far as possible 166 new_node = current_node.spellDown(c); 167 168 if (new_node != null) 169 { 170 current_node = new_node; 171 } 172 else 173 { 174 // the longest factor of the input has been found, 175 // now create a new factor from the current node's factor 176 current_factor = (Factor) current_node.getData(); 177 new_factor = new Factor (current_factor, num_factors, c); 178 179 // add the new character to the trie as well 180 current_node.add (new_factor, c); 181 182 // set up a pointer from the last factor to the new one 183 last_factor.setNext (new_factor); 184 last_factor = new_factor; 185 186 // restart at the root of the trie 187 current_node = root_node; 188 189 num_factors++; 190 } 191 } 192 193 // anything else, except whitespaces, will throw an exception 194 else if (!Character.isWhitespace(c)) 195 throw new InvalidSequenceException 196 ("Sequences can contain letters only."); 197 } 198 199 // if new_node is not null, the last factor is actually 200 // not a new factor but a factor already created 201 if (new_node != null) 202 { 203 // no new node is created, just point the last_factor to an 204 // existing one that represents the last characters of the text 205 last_factor.setNext((Factor) new_node.getData()); 206 207 num_factors++; 208 } 209 210 // check if read anything useful! 211 if (num_factors <= 1) 212 throw new InvalidSequenceException ("Empty sequence."); 213 } 214 215 /** 216 * Returns the root factor, the one that starts the list of factors. 217 * 218 * @return root factor 219 */ 220 public Factor getRootFactor () 221 { 222 return root_factor; 223 } 224 225 /** 226 * Returns the number of factors produced by the LZ78 parsing of the text. 227 * 228 * @return number of factors 229 */ 230 public int numFactors() 231 { 232 return num_factors; 233 } 234 235 /** 236 * Returns the number of characters of the original sequence. 237 * 238 * @return number of characters of the original sequence 239 */ 240 public int numChars () 241 { 242 return num_chars; 243 } 244 245 /** 246 * Reconstructs the sequence from the list of factors induced by the LZ78 parsing of 247 * the text. 248 * 249 * @return the original sequence 250 */ 251 public String toString () 252 { 253 StringBuffer buf = new StringBuffer(); 254 Factor node; 255 256 node = root_factor.getNext(); 257 258 for (int i = 1; i < numFactors(); i++) 259 { 260 buf.append(node); 261 262 node = node.getNext(); 263 } 264 265 return buf.toString(); 266 } 267 268 /** 269 * Returns a string representation of the actual list of factors produced by the LZ78 270 * parsing of the text. Each factor is printed out in a separate line, in the order 271 * they appear in the text, with its serial number, its ancestor's serial number, its 272 * new character, length and a string representation of the factor itself. 273 * 274 * @return a string representation of the list of factors 275 */ 276 public String printFactors () 277 { 278 StringBuffer buf = new StringBuffer(); 279 Factor factor; 280 281 factor = root_factor.getNext(); 282 283 for (int i = 1; i < numFactors(); i++) 284 { 285 buf.append (factor.getSerialNumber() + "\t<"); 286 buf.append (factor.getAncestor().getSerialNumber() + " ,\t"); 287 buf.append (factor.getNewChar() + ">\t"); 288 buf.append (factor.length() + "\t" + factor + "\n"); 289 290 factor = factor.getNext(); 291 } 292 293 buf.append(numFactors() + " factors\n"); 294 295 return buf.toString(); 296 } 297 }